import fastlmm
import h5py
import numpy as np
import time
# set degree of verbosity (adapt to INFO for more verbose output)
import logging
logging.basicConfig(level=logging.WARNING)
# set figure sizes
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
# set display width for pandas data frames
import pandas as pd
pd.set_option('display.width', 1000)
# import the algorithm
from fastlmm.association import single_snp
from pysnptools.snpreader import Pheno, Bed, SnpData
import fastlmm.util.util as flutil
from pysnptools.snpreader import SnpHdf5
from fastlmm.association import single_snp_linreg
bed_fn = "imputed_snps_binary.hdf5"
pheno_fn = "phenotype.txt"
#Loading the SNP data
f = h5py.File("1001_SNP_MATRIX/imputed_snps_binary.hdf5","r")
#Retrieving the data from the hdf5 snp file.
snps = f["snps"]
accessions = f["accessions"]
positions = f["positions"]
# Details of chromosomes
chr_regions = positions.attrs["chr_regions"]
indices_for_chr1 = chr_regions[0]
print(indices_for_chr1)
#Accessions which are available in 1001genomes org with their respective ids
df = pd.read_csv("AvailableMappedAccessions.csv")
#indices list of accessions that will be used for GWAS
accessions = accessions[:]
accs = accessions.astype(np.int64)
indices_of_accessions = [ np.where(accs == df["Accession Id"][i])[0][0] for i in range(len(df))]
indices_of_accessions = sorted(indices_of_accessions)
print (indices_of_accessions, len(indices_of_accessions))
#Snp data for creating SNP data matrix
#We need iid, sid, and vals
#iid is required to create SNP Data
iid = [[i+1, sorted(df["Accession Name"])[i]] for i in range(len(df))]
print ("iids : ", iid)
genes = snps[:, indices_of_accessions]
#Filtering genes for chromosome 1
genes = genes[: indices_for_chr1[-1]]
threshold= 5
genes = np.transpose(genes)
alleles = np.where(genes.sum(axis=0) >threshold)
snps = genes[:,alleles[0]]
print ("snps shape", snps.shape)
number_of_genes_discarded = genes.shape[1] - snps.shape[1]
print ("number of genes discarded", number_of_genes_discarded)
#sid is required to create SNP data
sid = range(snps.shape[1])
print(sid[:5])
#it's important create snps positions
pos = []
for i in sid:
pos.append([1,i, i])
pos = np.array(pos) #Converting into a numpy array
snps = snps.astype(np.float64)
snps.dtype
#Creating SNP Data
snp_data = SnpData(iid = iid, sid = sid,
val = snps, pos = pos)
snp_data.sid_count
snp_data.read().val[0]
#Working with Phenotypes
pheno_fn = Pheno("phenotypes.txt")
#Verifying Phenotypes
# pheno_vals = pheno_fn.read().val
# print(pheno_vals)
results_dataframe = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
results_dataframe.head()
# manhattan plot
import pylab
import fastlmm.util.util as flutil
flutil.manhattan_plot(results_dataframe[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 2
pheno_fn = Pheno("phenotypesFeature2.txt")
t0 = time.time()
results_dataframe_feature2 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature2[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 3
pheno_fn = Pheno("phenotypesFeature3.txt")
t0 = time.time()
results_dataframe_feature3 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature3[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 4
pheno_fn = Pheno("phenotypesFeature4.txt")
t0 = time.time()
results_dataframe_feature4 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature4[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 5
pheno_fn = Pheno("phenotypesFeature5.txt")
t0 = time.time()
results_dataframe_feature5 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature5[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 6
pheno_fn = Pheno("phenotypesFeature6.txt")
t0 = time.time()
results_dataframe_feature6 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature6[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 7
pheno_fn = Pheno("phenotypesFeature7.txt")
t0 = time.time()
results_dataframe_feature7 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature7[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 8
pheno_fn = Pheno("phenotypesFeature8.txt")
t0 = time.time()
results_dataframe_feature8 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature8[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 9
pheno_fn = Pheno("phenotypesFeature9.txt")
t0 = time.time()
results_dataframe_feature9 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature9[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 10
pheno_fn = Pheno("phenotypesFeature10.txt")
t0 = time.time()
results_dataframe_feature10 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature10[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 11
pheno_fn = Pheno("phenotypesFeature11.txt")
t0 = time.time()
results_dataframe_feature11 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature11[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 12
pheno_fn = Pheno("phenotypesFeature12.txt")
t0 = time.time()
results_dataframe_feature12 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature12[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes For feature 13
pheno_fn = Pheno("phenotypesFeature13.txt")
t0 = time.time()
results_dataframe_feature13 = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature13[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
#Working with Phenotypes for features 14-64
phenotype_file_name = "phenotypesFeature"
suffix = ".txt"
def gwasAnalysisWithManhattanPlot(pheno_fn):
t0 = time.time()
results_dataframe_feature_ = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
flutil.manhattan_plot(results_dataframe_feature_[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
for feature in range(14,65):
pheno_fn = phenotype_file_name + str(feature) + suffix
# print(pheno_fn)
print("Single SNP GWAS on" +phenotype_file_name+str(feature))
gwasAnalysisWithManhattanPlot(pheno_fn)
#QQ-Plots
from fastlmm.util.stats import plotp
#Working with Phenotypes for features 14-64
phenotype_file_name = "phenotypesFeature"
suffix = ".txt"
def gwasAnalysisWithQQPlot(pheno_fn):
t0 = time.time()
results_dataframe_feature_ = single_snp(test_snps=snp_data, pheno=pheno_fn, count_A1=False)
t1 = time.time()
print ("Time taken for analysis =", str(t1-t0),"seconds")
print("\nQQ Plot")
plotp.qqplot(results_dataframe_feature_["PValue"].values, xlim=[0,5], ylim=[0,5])
pylab.show()
for feature in range(1,65):
pheno_fn = phenotype_file_name + str(feature) + suffix
# print(pheno_fn)
print("Single SNP GWAS on" +phenotype_file_name+str(feature))
gwasAnalysisWithQQPlot(pheno_fn)